#### REPLICATION SCRIPT: QUO VADIS DIPLOMATS

# libraries
library(foreign)
library(tidyverse)
library(broom)
library(reshape2)
library(car)
library(Hmisc)
library(plm)
library(stargazer)
library(MASS)
library(vcd)
library(AER)

# Codebook
##  iso3: ISO-3 country code
##  rankmean: post hierarchial ranking (a-d), national mean ('rank' variable in csRank is re-scaled 1-4 integer)
##  spRelct: count of number of bilateral agreements with BR
##  undp_hdim: mean HDI score
##  nPers: number of personnel
##  contig: dummy for geogr. contiguity
##  distw: distance between capitals
##  n_igos: number of intergovernmental organizations HQs
##  nBrRes: number of Brazilian residents
##  emb12: country score in post tradition (12 point scale)
##  lang_ob: dummy for official language PT, ENG, FR or SPA
##  code for transformations: _ln (log()), _bi (billions), _l1 _l2 (lagged value)
##  t: time trend



# SECTION 4.2: DETERMINANTS OF POST RANKING

# Cross-section with mean 2008-2015 values for post ranking and MRE criteria
csRank <- read.csv(file="CSrank.csv")
csRank$rank <- as.factor(csRank$rank)

# Ordinal Logistic Regression
## formulas
rankForm <- rank ~ gdp_bi_ln + spRelct_ln + undp_hdim_ln
rankForms <- rank ~ scale(gdp_bi_ln) + scale(spRelct_ln) + scale(undp_hdim_ln)

## regressions
polr1 <- polr(rankForm, data=csRank, Hess=T) #un-standardized values
summary(polr1)
pval1 <- pnorm(abs(coef(summary(polr1))[, "t value"]), lower.tail = FALSE) * 2
cbind(coef(summary(polr1)), "p value" = round(pval1, 3))
confint(polr1)
### OR and CI
or1 <- exp(cbind(OR = coef(polr1), confint(polr1)))
or1

polr1s <- polr(rankForms, data=csRank, Hess=T) # standardized independent variables (replicates Table 3)
summary(polr1s)
pval1s <- pnorm(abs(coef(summary(polr1s))[, "t value"]), lower.tail = FALSE) * 2
cbind(coef(summary(polr1s)), "p value" = round(pval1s, 3))
confint(polr1s)
### OR and CI
or1s <- exp(cbind(OR = coef(polr1s), confint(polr1s)))
or1s

# Plots (replicates Figure 3)
## Histograms for empirical count of cases
ggplot(csRank, aes(x=undp_hdim_ln, fill=rank))+
  geom_histogram(position = "identity", color="black")+
  scale_fill_manual(values=c("black", "gray40", "gray80", "gray99"))+
  ylim(0,10)+
  labs(title="HDI histogram", x="HDI (log)", y="Count", fill="Observed\nRank")

ggplot(csRank, aes(x=gdp_bi_ln, fill=rank))+
  geom_histogram(position = "identity", color="black")+
  scale_fill_manual(values=c("black", "gray40", "gray80", "gray99"))+
  ylim(0,10)+
  labs(title="GDP histogram", x="GDP (bi, log)", y="Count", fill="Observed\nRank")
dev.off()

ggplot(csRank, aes(x=spRelct_ln, fill=rank))+
  geom_histogram(position = "identity", color="black")+
  scale_fill_manual(values=c("black", "gray40", "gray80", "gray99"))+
  ylim(0,10)+
  labs(title="N. Agreem. histogram", x="N. Agreem. (log)", y="Count", fill="Observed\nRank")

## Probability curves
plotdata1 <- csRank[complete.cases(csRank[,c(5,7,9)]),c(5,7,9)]
plotdata1 <- scale(plotdata1)
plotdata1 <- as.data.frame(plotdata1)
xbgdp <- plotdata1$gdp_bi_ln*coef(polr1s)[1]
xbsp <- plotdata1$spRelct_ln*coef(polr1s)[2]
xbhdi <- plotdata1$undp_hdim_ln*coef(polr1s)[3]

logistic_cdf <- function(x) {
  return( 1/(1+exp(-x) ) )
}

plotdatag <- as.data.frame(plotdata1$gdp_bi_ln)
colnames(plotdatag)[1] <- "gdp_bi_ln"
plotdatag$D <- logistic_cdf( -0.7578 - xbgdp )
plotdatag$C <- logistic_cdf( 3.5624 - xbgdp ) - logistic_cdf( -0.7578 - xbgdp )
plotdatag$B <- logistic_cdf( 6.4487 - xbgdp ) - logistic_cdf( 3.5624 - xbgdp )
plotdatag$A <- 1 - logistic_cdf( 3.5624 - xbgdp )

ggplot(melt(plotdatag, id.vars = c(colnames(plotdatag)[1]),
            variable.name = "predRank", value.name = "prob"),
       aes(x=gdp_bi_ln, y=prob, linetype=predRank))+geom_line(aes(linetype=predRank), size=1.2)+
  scale_linetype_manual(values=c("solid", "longdash", "dotdash", "dotted"))+
 ylim(c(0,1))+
  labs(title="GDP (log) x Probability", x="GDP (bi, log)", y="Probability", linetype="Predicted\nRank")

plotdatas <- as.data.frame(plotdata1$spRelct_ln)
colnames(plotdatas)[1] <- "spRelct_ln"
plotdatas$D <- logistic_cdf( -0.7578 - xbsp )
plotdatas$C <- logistic_cdf( 3.5624 - xbsp ) - logistic_cdf( -0.7578 - xbsp )
plotdatas$B <- logistic_cdf( 6.4487 - xbsp ) - logistic_cdf( 3.5624 - xbsp )
plotdatas$A <- 1 - logistic_cdf( 3.5624 - xbsp )

ggplot(melt(plotdatas, id.vars = c(colnames(plotdatas)[1]),
            variable.name = "predRank", value.name = "prob"),
       aes(x=spRelct_ln, y=prob, linetype=predRank))+geom_line(aes(linetype=predRank), size=1.2)+
  scale_linetype_manual(values=c("solid", "longdash", "dotdash", "dotted"))+
  ylim(c(0,1))+
  labs(title="N. Agreements (log) x Probability", x="N. Agreem. (log)", y="Probability", linetype="Predicted\nRank")

plotdatah <- as.data.frame(plotdata1$undp_hdim_ln)
colnames(plotdatah)[1] <- "undp_hdim_ln"
plotdatah$D <- logistic_cdf( -0.7578 - xbhdi )
plotdatah$C <- logistic_cdf( 3.5624 - xbhdi ) - logistic_cdf( -0.7578 - xbhdi )
plotdatah$B <- logistic_cdf( 6.4487 - xbhdi ) - logistic_cdf( 3.5624 - xbhdi )
plotdatah$A <- 1 - logistic_cdf( 3.5624 - xbhdi )

ggplot(melt(plotdatah, id.vars = c(colnames(plotdatah)[1]),
            variable.name = "predRank", value.name = "prob"),
       aes(x=undp_hdim_ln, y=prob, linetype=predRank))+geom_line(aes(linetype=predRank), size=1.2)+
  scale_linetype_manual(values=c("solid", "longdash", "dotdash", "dotted"))+
  ylim(c(0,1))+
  labs(title="HDI (log) x Probability", x="HDI (log)", y="Probability", linetype="Predicted\nRank")


# SECTION 4.3: DETERMINANTS OF DIPLOMATIC PRESENCE

# Cross-section with mean 2008-2015 values
cs <- read.csv(file="CSmean.csv")

# OLS regression
## formulas
olsForm1 <- nPers_ln ~ contig + distw + gdp_bi_ln + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln + g7 + brics + mercosul + cplp +
  emb12 + rankmean + lang_ob # full model
olsForm1s <- scale(nPers_ln) ~ scale(contig) + scale(distw) + scale(gdp_bi_ln) + scale(trade_ln) + scale(n_igos) +
  scale(spRelct_ln) + scale(nBrRes_ln) + scale(g7) + scale(brics) + scale(mercosul) + scale(cplp) +
  scale(emb12) + scale(rankmean) + scale(lang_ob) # same as 1, both Y and X standardized
olsForm1ss <- nPers_ln ~ scale(contig) + scale(distw) + scale(gdp_bi_ln) + scale(trade_ln) + scale(n_igos) +
  scale(spRelct_ln) + scale(nBrRes_ln) + scale(g7) + scale(brics) + scale(mercosul) + scale(cplp) +
  scale(emb12) + scale(rankmean) + scale(lang_ob) # same as 1, only X standardized

olsForm2 <- nPers_ln ~ contig + distw + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln + g7 + brics + mercosul + cplp +
  emb12 + lang_ob # reduced model, without gdp and rank

## regressions
olsMean <- lm(data=cs, olsForm1)
summary(olsMean) # regular OLS regression (Model 1)

olsMeans <- lm(data=cs, olsForm1s)
summary(olsMeans) # standardized coefficients, X and Y scaled

olsMeanss <- lm(data=cs, olsForm1ss)
summary(olsMeanss) # standardized coefficients, only X scaled

olsMean2 <- lm(data=cs, olsForm2) # reduced OLS regression (Model 2)

## diagnostics
qqPlot(olsMean)
hist(olsMean$residuals)
residualPlot(olsMean)
vif(olsMean) # gdb_bi_ln exceeds vif=10
infIndexPlot(olsMean, vars=c("Cook", "Studentized", "hat"), id.n = 3)
outlierTest(olsMean) # US does not qualify as outlier
bptest(olsMean) # no heteroskedasticity
ncvTest(olsMean) # error variance does not change with fitted-value range


# Negative binomial regression
csi <- cs
csi$nPers <- as.integer(csi$nPers) # transform into integer for nb regression

## Test for overdispersion
mean(csi$nPers)
var(csi$nPers) # 12.4 < 475.5. Overdispersed

## Poisson regression to test for overdispersion
poisMean <- glm(nPers ~ contig + distw + gdp_bi_ln + trade_ln + n_igos + spRelct_ln + 
                  nBrRes_ln + g7 + brics + mercosul + cplp + emb12 + rankmean + 
                  lang_ob, family=poisson, data=csi) # same form as olsForm1
summary(poisMean)
### dispersion test
dispersiontest(poisMean, trafo = NULL, alternative="greater")

## Negative binomial regression
nbMean <- glm.nb(nPers ~ contig + distw + 
                   gdp_bi_ln + trade_ln + n_igos +
                   spRelct_ln + nBrRes_ln +
                   g7 + brics + mercosul + cplp + 
                   emb12 + rankmean + lang_ob,
                 data=csi)
summary(nbMean)

nbMeans <- glm.nb(nPers ~ scale(contig) + scale(distw) + 
                    scale(gdp_bi_ln) + scale(trade_ln) + scale(n_igos) +
                    scale(spRelct_ln) + scale(nBrRes_ln) +
                    scale(g7) + scale(brics) + scale(mercosul) + scale(cplp) + 
                    scale(emb12) + scale(rankmean) + scale(lang_ob),
                  data=csi) #  standardized independent variables
summary(nbMeans)

nbMean2 <- glm.nb(nPers ~ contig + distw + 
                    trade_ln + n_igos + spRelct_ln + nBrRes_ln +
                    g7 + brics + mercosul + cplp + 
                    emb12 + lang_ob,
                  data=csi) # reduced model (no gdp and no rankmean)
summary(nbMean2)



# Panel data 2008-2015
pd <- read.csv(file="PD.csv")
pd <- pdata.frame(pd, index=c("iso3", "year")) # create pdataframe object

## centered version (only independent variables are centered)
pdc <- scale(as.matrix(pd[complete.cases(pd[,-c(4,5,7,8)]),-c(1:3,6)]))
pdc <- as.data.frame(pdc)
pdc <- cbind(pdc, pd[complete.cases(pd[,-c(4,5,7,8)]),c(1:3,6)])
pdc <- pdata.frame(pdc, index=c("iso3", "year"))

# Pooled OLS regression
## formulas
pdOlsForm1 <- nPers_ln ~ contig + distw +
  gdp_bi_ln + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln +
  g7 + brics + mercosul + cplp +
  rankmean + lang_ob + emb12 + t + nPers_l1_ln # w/ lag1

pdOlsForm2 <- nPers_ln ~ contig + distw +
  gdp_bi_ln + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln +
  g7 + brics + mercosul + cplp +
  rankmean + lang_ob + emb12 + t + nPers_l1_ln + as.factor(year) # w/ lag1 and year effects

pdOlsForm3 <- nPers_ln ~ contig + distw +
  gdp_bi_ln + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln +
  g7 + brics + mercosul + cplp +
  rankmean + lang_ob + emb12 + t + nPers_l1_ln + nPers_l2_ln + as.factor(year) # w/ lag1+2 and year effects

pdOlsForm4 <- nPers_ln ~ contig + distw +
  trade_ln + n_igos +
  spRelct_ln + nBrRes_ln +
  g7 + brics + mercosul + cplp +
  lang_ob + emb12 + t + nPers_l1_ln + nPers_l2_ln + as.factor(year) # same as 3, reduced (no gdp and rank)



## Regressions
pdOls1 <- plm(pdOlsForm1, data=pd, model = "pooling")

## Test for unobserved heterogeneity
pwtest(pdOls1) # no individual effect
plmtest(pdOls1, effect="individual", type="bp") # no individual effect
plmtest(pdOls1, effect="time", type="bp") # significant time effects

## Regressions with time effects
pdOls2 <- plm(pdOlsForm2, data=pd, model = "pooling") # lag 1 + year effect
pdOls3 <- plm(pdOlsForm3, data=pd, model = "pooling") # lag1+2 + year effect
pdOls4 <- plm(pdOlsForm4, data=pd, model = "pooling") # reduced model

## Test for serial correlation
pwtest(pdOls2)
pbsytest(pdOls2, test="ar")
pbgtest(pdOls2)
pwtest(pdOls3)
pbsytest(pdOls3, test="ar")
pbgtest(pdOls3) # model with lag1+2 and time effects presents no serial correlation

## test for contemporaneous correlation
pcdtest(pdOls3, test="lm") # contemporaneous correlation is present

## bp test for homoskedasticity
bptest(pdOls3) # heteroskedasticity-robuts coefs required


# Export all 6 models side by side, original values (replicates Table 4)
coeftest
stargazer(olsMean, olsMean2, nbMean, nbMean2,
          coeftest(pdOls3, vcovHC(pdOls3, method="arellano")),
          coeftest(pdOls4, vcovHC(pdOls4, method="arellano")),
          decimal.mark = ".", no.space = T, align = T, digits = 2,
          type="html")  # panel data models use heterosk-robust coefs via coeftest



# PLOT STANDARDIZED COEFFICIENTS (Replicates Figure 4)
df.olsMeanss <- tidy(olsMeanss)
df.olsMeanss <- cbind(df.olsMeanss, confint_tidy(olsMeanss))
df.olsMeanss$model <- "OLS"
df.olsMeanss$term<- c("Intercept", "Contig.", "Distance", "GDP (log)", "Trade (log)", "N. IGOs",
                       "N.Agreem.(log)", "N.BR.Res.(log)", "G7", "BRICS", "Mercosul","CPLP",
                       "Post Trad.", "Post Rank", "Lang.")

df.nbs <- tidy(nbMeans)
df.nbs <- cbind(df.nbs, confint_tidy(nbMeans))
df.nbs$model <- "Neg.Bin."
df.nbs$term <- c("Intercept", "Contig.", "Distance", "GDP (log)", "Trade (log)","N. IGOs",
                 "N.Agreem.(log)", "N.BR.Res.(log)", "G7", "BRICS", "Mercosul","CPLP",
                 "Post Trad.", "Post Rank", "Lang.")

pdOls5 <- plm(pdOlsForm3, data=pdc, model = "pooling") # same as pdOls3, but using centered independent variables
df.pd <-tidy(coeftest(pdOls5, vcovHC(pdOls5, method="arellano")))
confint(coeftest(pdOls5, vcovHC(pdOls5, method="arellano")))
df.pd <- cbind(df.pd, confint_tidy(coeftest(pdOls5, vcovHC(pdOls5, method="arellano"))))
df.pd$model <- "P.OLS"
df.pd$term <- c("Intercept", "Contig.", "Distance", "GDP (log)", "Trade (log)", "N. IGOs",
                "N.Agreem.(log)", "N.BR.Res.(log)", "G7", "BRICS", "Mercosul","CPLP",
                "Post Rank", "Lang.", "Post Trad.", "t", "LDV1", "LDV2",
                "d2009", "d2010", "d2011", "d2012", "d2013", "d2014")



df.olsnbpd <- rbind(df.olsMeanss, df.nbs)
df.olsnbpd <- rbind(df.olsnbpd, df.pd)
df.olsnbpd.s <- df.olsnbpd[-c(49:54),]
df.olsnbpd.s$p <- ifelse(df.olsnbpd.s$p.value<0.05,1,0)
df.olsnbpd.s$order <- c(rep(1:15,2),1:12,14,15,13,16:18)
df.olsnbpd.s$term <- reorder(df.olsnbpd.s$term, -df.olsnbpd.s$order)

ggplot(df.olsnbpd.s, aes(x=term, y=estimate, shape=model, alpha=p))+
  geom_point(position=position_dodge(0.6), size=2)+
  geom_linerange(data=df.olsnbpd.s, aes(ymin=conf.low,ymax=conf.high), position=position_dodge(0.6))+
  geom_hline(yintercept = 0)+
  scale_shape_manual(name="Model", values=c("circle", "square", "triangle"))+
  scale_alpha(range = c(0.3,1), guide=F)+
  coord_flip()+
  theme(legend.position = c(0.8, 0.2))+
  xlab("Determinant")+ylab(expression(paste("Estimate (", italic("b"), ")")))


# INTERACTIVE MODEL (Annex)
pdOlsForm3i <- nPers_ln ~ contig + distw +
  gdp_bi_ln + trade_ln + n_igos +
  spRelct_ln + nBrRes_ln + nBrRes_ln*gdp_bi_ln +
  g7 + brics + mercosul + cplp +
  rankmean + lang_ob + emb12 + t + nPers_l1_ln + nPers_l2_ln + as.factor(year) # same as 3, with interctive term

pdOls3i <- plm(pdOlsForm3i, data=pdc, model = "pooling") # on centered data

# Plot marginal effects
meplot <- function(model,var1,var2,int,vcov,ci=.95,
                   xlab=var2,ylab=paste("Marginal Effect of",var1),
                   main="Marginal Effect Plot",
                   me_lty=1,me_lwd=1,me_col="black",
                   ci_lty=1,ci_lwd=.5,ci_col="black",
                   yint_lty=2,yint_lwd=1,yint_col="black"){
  require(ggplot2)
  alpha <- 1-ci
  z <- qnorm(1-alpha/2)
  beta.hat <- coef(model)
  cov <- vcov
  z0 <- seq(min(model.frame(model)[,var2],na.rm=T),max(model.frame(model)[,var2],na.rm=T),length.out=1000)
  dy.dx <- beta.hat[var1] + beta.hat[int]*z0
  se.dy.dx <- sqrt(cov[var1,var1] + z0^2*cov[nrow(cov),ncol(cov)] + 2*z0*cov[var1,ncol(cov)])
  upr <- dy.dx + z*se.dy.dx
  lwr <- dy.dx - z*se.dy.dx
  ggplot(data=NULL,aes(x=z0, y=dy.dx)) +
    labs(x=xlab,y=ylab,title=main) +
    geom_line(aes(z0, dy.dx),size = me_lwd, 
              linetype = me_lty, 
              color = me_col) +
    geom_line(aes(z0, lwr), size = ci_lwd, 
              linetype = ci_lty, 
              color = ci_col) +
    geom_line(aes(z0, upr), size = ci_lwd, 
              linetype = ci_lty, 
              color = ci_col) +
    geom_hline(yintercept=0,linetype=yint_lty,
               size=yint_lwd,
               color=yint_col)
}
meplot(model=pdOls3i, var1="gdp_bi_ln", var2="nBrRes_ln", int="gdp_bi_ln:nBrRes_ln", vcov=vcov(pdOls3i),
       xlab="N. BR. Residents (log)", ylab="Marginal Effect of GDP bi (log)"
       )
